Author: Tomasz Karolewski
Supervisor: Krzysztof Gogolewski, Ph.D.
Data source: COVID-19 - Johns Hopkins University
predict_df
K-means and hierarchical clustering for PCA
#@title
# Silhouette-based choice of the number of clusters for the PCA data.
# Each KElbowVisualizer is given the estimator, the k range (3, 8) and the
# 'silhouette' metric, is fitted on pca_df and then shown (with fit timings).

# Silhouette score for k-means
model = KMeans(random_state=100)
visualizer = KElbowVisualizer(model, k=(3, 8), metric='silhouette', timings=True, size=(500, 300))
visualizer.fit(pca_df)
visualizer.show()

# Silhouette score for hierarchical clustering.
# Ward linkage only accepts Euclidean distances, which is also the estimator's
# default, so the distance keyword is omitted: its old spelling `affinity` was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`).
model = AgglomerativeClustering(linkage='ward')
visualizer = KElbowVisualizer(model, k=(3, 8), metric='silhouette', timings=True, size=(500, 300))
visualizer.fit(pca_df)
visualizer.show()
#@title
# K-means clustering of `data` into n+1 groups: print the members of every
# cluster, draw the clusters on a choropleth map, then plot each cluster's
# country series together with the cluster mean.

# n+1 number of clusters
n = 3
print("\033[1m" + f"{n+1} clusters: " + "\033[0m")
kmeans = KMeans(n_clusters=n + 1, random_state=100).fit(data)

# One list of countries per cluster label, printed as it is built.
clusters_list = []
for label in range(n + 1):
    print(f"Cluster {label+1}: ")
    members = [
        name
        for position, name in enumerate(data.index)
        if kmeans.labels_[position] == label
    ]
    print(members, end="\n")
    clusters_list.append(members)
clusters_list_km_pca = clusters_list.copy()

# Choropleth trace: the colorscale is split into one flat colour band per cluster.
config = {
    'type': 'choropleth',
    'locations': data.index.values,
    'locationmode': 'country names',
    # NOTE(review): one extra value (n+1) is appended, so z is one element
    # longer than locations - confirm this is intentional.
    'z': np.append(kmeans.labels_ + 1, n + 1).astype(int),
    'colorscale': [
        (0.00, "rgb(255, 196, 51)"), (0.25, "rgb(255, 196, 51)"),
        (0.25, "rgb(255, 51, 119)"), (0.5, "rgb(255, 51, 119)"),
        (0.5, "rgb(219, 51, 255)"), (0.75, "rgb(219, 51, 255)"),
        (0.75, "rgb(51, 189, 255)"), (1.00, "rgb(51, 189, 255)"),
    ],
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
    'colorbar': {'nticks': 4, 'tickprefix': 'Cluster '},
}
config_coloraxis = {
    'tickvals': list(range(1, n + 2)),
    'title': 'Clusters',
    'ticks': 'outside',
}
config_margin = {'r': 25, 't': 25, 'l': 25, 'b': 25}

# First part of the figure: the map, rendered to a static image.
fig = go.Figure(data=[config])
fig.update_geos(
    scope='world', projection_type='mercator', resolution=50,
    lataxis_showgrid=True, lataxis_range=[40, 75], lataxis_dtick=10,
    lonaxis_showgrid=True, lonaxis_range=[-30, 70], lonaxis_dtick=10,
)
fig.update_layout(
    height=350, width=500, margin=config_margin,
    coloraxis_colorbar=config_coloraxis,
    title='Division of map according to k-means clustering', title_y=0.97,
)
display(Image(fig.to_image()))

# Figure title settings standing in for a legend title
# (flagged by the author as a broken method).
config_title = {
    'text': 'Countries',
    'x': 0.875,
    'y': 0.99,
    'font_size': 13,
    'font_family': 'Arial',
}

# Second part of the figure: one subplot per cluster.
subplot_titles = [f"Cluster {number + 1}" for number in range(len(clusters_list))]
fig2 = subplots.make_subplots(rows=int(n + 1), cols=1, subplot_titles=subplot_titles, vertical_spacing=0.06)
for group, cluster in enumerate(clusters_list):
    subplot_row = int(group) + 1
    # Row-wise mean over the columns of the cluster's countries.
    cluster_mean = np.mean(europe_df_rolled[cluster], axis=1)
    frame = pd.DataFrame(cluster_mean, columns=['Mean'], index=europe_df_rolled.index)
    for country in cluster:
        # Break the one long country name so it fits in the legend.
        legend_name = 'Bosnia and<br> Herzegovina' if country == 'Bosnia and Herzegovina' else country
        fig2.add_trace(
            go.Scatter(x=europe_df_rolled.index, y=europe_df_rolled[country],
                       name=legend_name, legendgroup=group),
            row=subplot_row, col=1,
        )
    fig2.update_xaxes(title_text='Date', row=subplot_row, col=1)
    fig2.update_yaxes(title_text='New cases per milion', row=subplot_row, col=1, range=[0, 2900])
    # Cluster mean drawn last, as a thick dashed black line without a legend entry.
    fig2.add_trace(
        go.Scatter(x=frame.index, y=frame['Mean'], name='Mean',
                   line={'color': 'black', 'width': 3, 'dash': 'dash'},
                   legendgroup=group, showlegend=False),
        row=subplot_row, col=1,
    )
fig2.update_layout(height=1200, width=800, margin=config_margin, legend_tracegroupgap=55, title=config_title)
display(Image(fig2.to_image()))
print("\n")
#@title
# Hierarchical (Ward) clustering of `data` into n+1 groups: print the members
# of every cluster, draw the clusters on a choropleth map, then plot each
# cluster's country series together with the cluster mean.

# n+1 number of clusters
n = 4
print("\033[1m" + f"{n+1} clusters: " + "\033[0m")
# The fitted estimator keeps the notebook's shared name `kmeans` although it is
# an AgglomerativeClustering model. Ward linkage only accepts Euclidean
# distances, which is the default, so the distance keyword is omitted: its old
# spelling `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4.
kmeans = AgglomerativeClustering(n_clusters=n + 1, linkage='ward').fit(data)

# One list of countries per cluster label, printed as it is built.
clusters_list = []
for label in range(n + 1):
    print(f"Cluster {label+1}: ")
    members = [
        name
        for position, name in enumerate(data.index)
        if kmeans.labels_[position] == label
    ]
    print(members, end="\n")
    clusters_list.append(members)

# Choropleth trace: the colorscale is split into one flat colour band per cluster.
config = {
    'type': 'choropleth',
    'locations': data.index.values,
    'locationmode': 'country names',
    # NOTE(review): one extra value (n+1) is appended, so z is one element
    # longer than locations - confirm this is intentional.
    'z': np.append(kmeans.labels_ + 1, n + 1).astype(int),
    'colorscale': [
        (0.00, "rgb(255, 196, 51)"), (0.2, "rgb(255, 196, 51)"),
        (0.2, "rgb(255, 51, 119)"), (0.4, "rgb(255, 51, 119)"),
        (0.4, "rgb(219, 51, 255)"), (0.6, "rgb(219, 51, 255)"),
        (0.6, "rgb(51, 189, 255)"), (0.8, "rgb(51, 189, 255)"),
        (0.8, "rgb(51, 255, 53)"), (1.0, "rgb(51, 255, 53)"),
    ],
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
    'colorbar': {'nticks': 5, 'tickprefix': 'Cluster '},
}
config_coloraxis = {
    'tickvals': list(range(1, n + 2)),
    'title': 'Clusters',
    'ticks': 'outside',
}
config_margin = {'r': 25, 't': 25, 'l': 25, 'b': 25}

# First part of the figure: the map, rendered to a static image.
fig = go.Figure(data=[config])
fig.update_geos(
    scope='world', projection_type='mercator', resolution=50,
    lataxis_showgrid=True, lataxis_range=[40, 75], lataxis_dtick=10,
    lonaxis_showgrid=True, lonaxis_range=[-30, 70], lonaxis_dtick=10,
)
fig.update_layout(
    height=350, width=500, margin=config_margin,
    coloraxis_colorbar=config_coloraxis,
    title='Division of map according to hierarchical clustering', title_y=0.97,
)
display(Image(fig.to_image()))

# Figure title settings standing in for a legend title
# (flagged by the author as a broken method).
config_title = {
    'text': 'Countries',
    'x': 0.875,
    'y': 0.99,
    'font_size': 13,
    'font_family': 'Arial',
}

# Second part of the figure: one subplot per cluster.
subplot_titles = [f"Cluster {number + 1}" for number in range(len(clusters_list))]
fig2 = subplots.make_subplots(rows=int(n + 1), cols=1, subplot_titles=subplot_titles, vertical_spacing=0.06)
for group, cluster in enumerate(clusters_list):
    subplot_row = int(group) + 1
    # Row-wise mean over the columns of the cluster's countries.
    cluster_mean = np.mean(europe_df_rolled[cluster], axis=1)
    frame = pd.DataFrame(cluster_mean, columns=['Mean'], index=europe_df_rolled.index)
    for country in cluster:
        # Break the one long country name so it fits in the legend.
        legend_name = 'Bosnia and<br> Herzegovina' if country == 'Bosnia and Herzegovina' else country
        fig2.add_trace(
            go.Scatter(x=europe_df_rolled.index, y=europe_df_rolled[country],
                       name=legend_name, legendgroup=group),
            row=subplot_row, col=1,
        )
    fig2.update_xaxes(title_text='Date', row=subplot_row, col=1)
    fig2.update_yaxes(title_text='New cases per milion', row=subplot_row, col=1, range=[0, 2900])
    # Cluster mean drawn last, as a thick dashed black line without a legend entry.
    fig2.add_trace(
        go.Scatter(x=frame.index, y=frame['Mean'], name='Mean',
                   line={'color': 'black', 'width': 3, 'dash': 'dash'},
                   legendgroup=group, showlegend=False),
        row=subplot_row, col=1,
    )
fig2.update_layout(height=1400, width=800, margin=config_margin, legend_tracegroupgap=55, title=config_title)
display(Image(fig2.to_image()))
print("\n")
Prediction
# Fit a random-forest regressor on the training frames and predict the
# 'predicted_right_base' value for the prediction and validation frames.
# NOTE(review): predict_df / validate_df / X_train_df / y_train_df are not
# defined in this cell - they must already exist when it runs.
model = RandomForestRegressor(
    criterion='absolute_error',
    min_samples_leaf=5,
    n_estimators=500,
    oob_score=True,
    random_state=100,
)
# Alternative model left disabled by the author:
#model = XGBRegressor(booster='gblinear', learning_rate=0.33, max_depth=1, min_child_weight=0, n_estimators=500, n_jobs=-1, objective='reg:squarederror', random_state=100)
model.fit(X_train_df, y_train_df.values.ravel())

# The frames are transposed before being passed to the model.
predict_features = predict_df.T
predictions = pd.DataFrame(
    model.predict(predict_features),
    index=predict_features.index,
    columns=['predicted_right_base'],
)
validate_features = validate_df.T
validation = pd.DataFrame(
    model.predict(validate_features),
    index=validate_features.index,
    columns=['predicted_right_base'],
)
# Last expression of the cell: inputs side by side with their predictions.
pd.concat([predict_features, predictions], axis=1)
#@title
# Wave extraction and plotting, one pass per country column of europe_df_rolled:
#   1. find peaks in the series and measure their base widths,
#   2. drop "subpeaks" whose base lies inside the base of an adjacent peak,
#   3. describe every remaining wave as a triangle (left base, peak, right base),
#   4. merge the columns returned by create_wave_df_column into
#      waves_df / train_df / predict_df / validate_df,
#   5. plot the series, the triangles and the predicted / validated right sides.
# NOTE(review): the plot reads `predictions` and `validation`, which are built
# by the Prediction cell from predict_df / validate_df created here - confirm
# the intended execution order of the two cells.
waves_df = pd.DataFrame()
train_df = pd.DataFrame()
predict_df = pd.DataFrame()
validate_df = pd.DataFrame()

for country in europe_df_rolled.columns:
    tmp_df = europe_df_rolled[country].copy().reset_index(drop=True)

    # Temporarily zero the first and last samples for better peak extraction.
    # The scalars are read with iloc[0] / iloc[-1]: calling float() on a
    # one-element Series slice is deprecated in recent pandas.
    first_day = float(tmp_df.iloc[0])
    last_day = float(tmp_df.iloc[-1])
    tmp_df[0] = 0
    tmp_df[len(tmp_df) - 1] = 0

    peaks, _ = find_peaks(tmp_df, prominence=35, distance=60)
    # peak_widths returns (widths, width_heights, left_ips, right_ips);
    # rel_height=1 measures the width at the peak's lowest contour line.
    results_full = peak_widths(tmp_df, peaks, rel_height=1, wlen=250)
    print("\n")
    print("\033[1m peaks before removing subpeaks: \033[0m", peaks, sep=' ')

    # Removing subpeaks: a peak is dropped when its [left, right] base interval
    # lies inside the base interval of the previous or the next peak.
    if len(peaks) > 1:
        left_ips, right_ips = results_full[2], results_full[3]
        last_index = len(peaks) - 1
        index_to_del = []
        for idx in range(len(peaks)):
            neighbours = [j for j in (idx - 1, idx + 1) if 0 <= j <= last_index]
            if any(left_ips[idx] >= left_ips[j] and right_ips[idx] <= right_ips[j]
                   for j in neighbours):
                index_to_del.append(idx)
        print("\033[1m subpeaks indexes: \033[0m", index_to_del, sep=' ')
        peaks = np.delete(peaks, index_to_del)
        results_full = np.delete(np.array(results_full), index_to_del, axis=1)
    print("\033[1m peaks: \033[0m", peaks, sep=' ')

    # Triangle points. The right base point reuses the left base height, so
    # the base of every triangle is horizontal.
    left_x = results_full[2]
    base_left_point = {'x': left_x, 'y': tmp_df[left_x.astype(int)].values}
    base_right_point = {'x': results_full[3], 'y': base_left_point['y']}
    peak_point = {'x': peaks, 'y': tmp_df[peaks].values}

    # Lengths of the triangle sides
    real_height = peak_point['y'] - base_left_point['y']
    left_side_base = peak_point['x'] - base_left_point['x']
    right_side_base = base_right_point['x'] - peak_point['x']

    # Angles at the base, in degrees
    left_angle = np.degrees(np.arctan(real_height / left_side_base))
    right_angle = np.degrees(np.arctan(real_height / right_side_base))

    # The right side of the last wave is unknown, so it is dropped
    right_side_base = right_side_base[:-1]
    right_angle = right_angle[:-1]
    print("\033[1m left angles: \033[0m", left_angle, sep=' ')
    print("\033[1m right angles: \033[0m", right_angle, sep=' ')

    # Saving wave data
    wave_column, train_column, predict_column, validate_column = create_wave_df_column(country, real_height, left_side_base, right_side_base, base_left_point['x'], base_left_point['y'])
    waves_df = pd.merge(left=waves_df, right=wave_column, how="outer", left_index=True, right_index=True, sort=False)
    train_df = pd.merge(left=train_df, right=train_column, how="outer", left_index=True, right_index=True, sort=False)
    predict_df = pd.merge(left=predict_df, right=predict_column, how="outer", left_index=True, right_index=True, sort=False)
    validate_df = pd.merge(left=validate_df, right=validate_column, how="outer", left_index=True, right_index=True, sort=False)

    # Restoring the original first/last samples for plotting
    tmp_df[0] = first_day
    tmp_df[len(tmp_df) - 1] = last_day

    # Plotting
    plt.plot(tmp_df, label='New cases')
    plt.title(f"{country} - peaks and prediction", fontsize=16)
    plt.ylim([0, 3000])
    plt.xlim([0, 850])
    plt.xlabel('Day', fontsize=13)
    plt.ylabel('New cases per milion', fontsize=13)

    # Full triangles for every wave except the last one
    for iterator in range(len(peaks) - 1):
        plt.plot([base_left_point['x'][iterator], peak_point['x'][iterator]],
                 [base_left_point['y'][iterator], peak_point['y'][iterator]],
                 color='black', linestyle='--')
        plt.plot([peak_point['x'][iterator], base_right_point['x'][iterator]],
                 [peak_point['y'][iterator], base_right_point['y'][iterator]],
                 color='black', linestyle='--')
        plt.plot([peak_point['x'][iterator], peak_point['x'][iterator]],
                 [base_left_point['y'][iterator], peak_point['y'][iterator]],
                 color='black', linestyle='--')
        plt.hlines(base_left_point['y'][iterator],
                   base_left_point['x'][iterator],
                   base_right_point['x'][iterator],
                   color='black', linestyle='--')

    # Guard: without any peak there is no last wave to draw (indexing the
    # empty arrays used to raise IndexError).
    if len(peaks) > 0:
        # The last peak only has a left side because the right one is unknown
        iterator = len(peaks) - 1
        plt.plot([base_left_point['x'][iterator], peak_point['x'][iterator]],
                 [base_left_point['y'][iterator], peak_point['y'][iterator]],
                 color='black', linestyle='--', label='Wave')
        plt.plot([peak_point['x'][iterator], peak_point['x'][iterator]],
                 [base_left_point['y'][iterator], peak_point['y'][iterator]],
                 color='black', linestyle='--')
        plt.hlines(base_left_point['y'][iterator],
                   base_left_point['x'][iterator],
                   peak_point['x'][iterator],
                   color='black', linestyle='--')

        # Predicted right side of the last wave. The prediction is reduced to
        # a plain float so the coordinate lists hold scalars only, not a
        # one-element Series next to a scalar.
        predicted_base = float(predictions.T[country].iloc[0])
        plt.plot([peak_point['x'][iterator], peak_point['x'][iterator] + predicted_base],
                 [peak_point['y'][iterator], base_left_point['y'][iterator]],
                 color='red', linestyle=(0, (5, 7)), label='Predicition')
        plt.hlines(base_left_point['y'][iterator],
                   peak_point['x'][iterator],
                   peak_point['x'][iterator] + predicted_base,
                   color='red', linestyle=(0, (5, 7)))

    # Validation is drawn on the second-to-last wave. Guard: with a single
    # peak the index len(peaks)-2 is -1 and would silently select the last
    # peak instead.
    if len(peaks) > 1:
        iterator = len(peaks) - 2
        validated_base = float(validation.T[country].iloc[0])
        plt.plot([peak_point['x'][iterator], peak_point['x'][iterator] + validated_base],
                 [peak_point['y'][iterator], base_left_point['y'][iterator]],
                 color='red', linestyle=(0, (5, 6)))
        plt.hlines(base_left_point['y'][iterator],
                   peak_point['x'][iterator],
                   peak_point['x'][iterator] + validated_base,
                   color='red', linestyle=(0, (5, 6)))

    plt.legend(frameon=True, framealpha=1, edgecolor='black')
    plt.show()
K-means and hierarchical clustering for triangles
#@title
# Silhouette-based choice of the number of clusters for the wave ("triangle")
# data. Missing values in waves_df are replaced with 0 and the frame is
# transposed before it is passed to each visualizer.

# Silhouette score for k-means
model = KMeans(random_state=100)
visualizer = KElbowVisualizer(model, k=(3, 8), metric='silhouette', timings=True, size=(500, 300))
visualizer.fit(waves_df.fillna(0).T)
visualizer.show()

# Silhouette score for hierarchical clustering.
# Ward linkage only accepts Euclidean distances, which is also the estimator's
# default, so the distance keyword is omitted: its old spelling `affinity` was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`).
model = AgglomerativeClustering(linkage='ward')
visualizer = KElbowVisualizer(model, k=(3, 8), metric='silhouette', timings=True, size=(500, 300))
visualizer.fit(waves_df.fillna(0).T)
visualizer.show()
#@title
# K-means clustering of the wave ("triangle") data into n+1 groups: print the
# members of every cluster, draw the clusters on a choropleth map, then plot
# each cluster's country series together with the cluster mean.

# n+1 number of clusters
n = 2
print("\033[1m" + f"{n+1} clusters: " + "\033[0m")
kmeans = KMeans(n_clusters=n + 1, random_state=100).fit(waves_df.fillna(0).T)

# One list of countries per cluster label, printed as it is built.
# NOTE(review): labels are mapped to countries through data.index although the
# model was fitted on waves_df.fillna(0).T - this assumes both list the
# countries in the same order; confirm.
clusters_list = []
for label in range(n + 1):
    print(f"Cluster {label+1}: ")
    members = [
        name
        for position, name in enumerate(data.index)
        if kmeans.labels_[position] == label
    ]
    print(members, end="\n")
    clusters_list.append(members)
clusters_list_km_triangle = clusters_list.copy()

# Choropleth trace: the colorscale is split into one flat colour band per cluster.
config = {
    'type': 'choropleth',
    'locations': data.index.values,
    'locationmode': 'country names',
    # NOTE(review): one extra value (n+1) is appended, so z is one element
    # longer than locations - confirm this is intentional.
    'z': np.append(kmeans.labels_ + 1, n + 1).astype(int),
    'colorscale': [
        (0.00, "rgb(255, 196, 51)"), (0.33, "rgb(255, 196, 51)"),
        (0.33, "rgb(255, 51, 119)"), (0.66, "rgb(255, 51, 119)"),
        (0.66, "rgb(219, 51, 255)"), (1.00, "rgb(219, 51, 255)"),
    ],
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
    'colorbar': {'nticks': 3, 'tickprefix': 'Cluster '},
}
config_coloraxis = {
    'tickvals': list(range(1, n + 2)),
    'title': 'Clusters',
    'ticks': 'outside',
}
config_margin = {'r': 25, 't': 25, 'l': 25, 'b': 25}

# First part of the figure: the map, rendered to a static image.
fig = go.Figure(data=[config])
fig.update_geos(
    scope='world', projection_type='mercator', resolution=50,
    lataxis_showgrid=True, lataxis_range=[40, 75], lataxis_dtick=10,
    lonaxis_showgrid=True, lonaxis_range=[-30, 70], lonaxis_dtick=10,
)
fig.update_layout(
    height=350, width=500, margin=config_margin,
    coloraxis_colorbar=config_coloraxis,
    title='Division of map according to k-means clustering', title_y=0.97,
)
display(Image(fig.to_image()))

# Figure title settings standing in for a legend title
# (flagged by the author as a broken method).
config_title = {
    'text': 'Countries',
    'x': 0.875,
    'y': 0.99,
    'font_size': 13,
    'font_family': 'Arial',
}

# Second part of the figure: one subplot per cluster.
subplot_titles = [f"Cluster {number + 1}" for number in range(len(clusters_list))]
fig2 = subplots.make_subplots(rows=int(n + 1), cols=1, subplot_titles=subplot_titles, vertical_spacing=0.08)
for group, cluster in enumerate(clusters_list):
    subplot_row = int(group) + 1
    # Row-wise mean over the columns of the cluster's countries.
    cluster_mean = np.mean(europe_df_rolled[cluster], axis=1)
    frame = pd.DataFrame(cluster_mean, columns=['Mean'], index=europe_df_rolled.index)
    for country in cluster:
        # Break the one long country name so it fits in the legend.
        legend_name = 'Bosnia and<br> Herzegovina' if country == 'Bosnia and Herzegovina' else country
        fig2.add_trace(
            go.Scatter(x=europe_df_rolled.index, y=europe_df_rolled[country],
                       name=legend_name, legendgroup=group),
            row=subplot_row, col=1,
        )
    fig2.update_xaxes(title_text='Date', row=subplot_row, col=1)
    fig2.update_yaxes(title_text='New cases per milion', row=subplot_row, col=1, range=[0, 2900])
    # Cluster mean drawn last, as a thick dashed black line without a legend entry.
    fig2.add_trace(
        go.Scatter(x=frame.index, y=frame['Mean'], name='Mean',
                   line={'color': 'black', 'width': 3, 'dash': 'dash'},
                   legendgroup=group, showlegend=False),
        row=subplot_row, col=1,
    )
fig2.update_layout(height=1000, width=800, margin=config_margin, legend_tracegroupgap=30, title=config_title)
display(Image(fig2.to_image()))
print("\n")
#@title
# Hierarchical (Ward) clustering of the wave ("triangle") data into n+1
# groups: print the members of every cluster, draw the clusters on a
# choropleth map, then plot each cluster's country series with the cluster mean.

# n+1 number of clusters
n = 2
print("\033[1m" + f"{n+1} clusters: " + "\033[0m")
# The fitted estimator keeps the notebook's shared name `kmeans` although it is
# an AgglomerativeClustering model. Ward linkage only accepts Euclidean
# distances, which is the default, so the distance keyword is omitted: its old
# spelling `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4.
kmeans = AgglomerativeClustering(n_clusters=n + 1, linkage='ward').fit(waves_df.fillna(0).T)

# One list of countries per cluster label, printed as it is built.
# NOTE(review): labels are mapped to countries through data.index although the
# model was fitted on waves_df.fillna(0).T - this assumes both list the
# countries in the same order; confirm.
clusters_list = []
for label in range(n + 1):
    print(f"Cluster {label+1}: ")
    members = [
        name
        for position, name in enumerate(data.index)
        if kmeans.labels_[position] == label
    ]
    print(members, end="\n")
    clusters_list.append(members)

# Choropleth trace: the colorscale is split into one flat colour band per cluster.
config = {
    'type': 'choropleth',
    'locations': data.index.values,
    'locationmode': 'country names',
    # NOTE(review): one extra value (n+1) is appended, so z is one element
    # longer than locations - confirm this is intentional.
    'z': np.append(kmeans.labels_ + 1, n + 1).astype(int),
    'colorscale': [
        (0.00, "rgb(255, 196, 51)"), (0.33, "rgb(255, 196, 51)"),
        (0.33, "rgb(255, 51, 119)"), (0.66, "rgb(255, 51, 119)"),
        (0.66, "rgb(219, 51, 255)"), (1.00, "rgb(219, 51, 255)"),
    ],
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
    'colorbar': {'nticks': 3, 'tickprefix': 'Cluster '},
}
config_coloraxis = {
    'tickvals': list(range(1, n + 2)),
    'title': 'Clusters',
    'ticks': 'outside',
}
config_margin = {'r': 25, 't': 25, 'l': 25, 'b': 25}

# First part of the figure: the map, rendered to a static image.
fig = go.Figure(data=[config])
fig.update_geos(
    scope='world', projection_type='mercator', resolution=50,
    lataxis_showgrid=True, lataxis_range=[40, 75], lataxis_dtick=10,
    lonaxis_showgrid=True, lonaxis_range=[-30, 70], lonaxis_dtick=10,
)
fig.update_layout(
    height=350, width=500, margin=config_margin,
    coloraxis_colorbar=config_coloraxis,
    title='Division of map according to hierarchical clustering', title_y=0.97,
)
display(Image(fig.to_image()))

# Figure title settings standing in for a legend title
# (flagged by the author as a broken method).
config_title = {
    'text': 'Countries',
    'x': 0.875,
    'y': 0.99,
    'font_size': 13,
    'font_family': 'Arial',
}

# Second part of the figure: one subplot per cluster.
subplot_titles = [f"Cluster {number + 1}" for number in range(len(clusters_list))]
fig2 = subplots.make_subplots(rows=int(n + 1), cols=1, subplot_titles=subplot_titles, vertical_spacing=0.08)
for group, cluster in enumerate(clusters_list):
    subplot_row = int(group) + 1
    # Row-wise mean over the columns of the cluster's countries.
    cluster_mean = np.mean(europe_df_rolled[cluster], axis=1)
    frame = pd.DataFrame(cluster_mean, columns=['Mean'], index=europe_df_rolled.index)
    for country in cluster:
        # Break the one long country name so it fits in the legend.
        legend_name = 'Bosnia and<br> Herzegovina' if country == 'Bosnia and Herzegovina' else country
        fig2.add_trace(
            go.Scatter(x=europe_df_rolled.index, y=europe_df_rolled[country],
                       name=legend_name, legendgroup=group),
            row=subplot_row, col=1,
        )
    fig2.update_xaxes(title_text='Date', row=subplot_row, col=1)
    fig2.update_yaxes(title_text='New cases per milion', row=subplot_row, col=1, range=[0, 2900])
    # Cluster mean drawn last, as a thick dashed black line without a legend entry.
    fig2.add_trace(
        go.Scatter(x=frame.index, y=frame['Mean'], name='Mean',
                   line={'color': 'black', 'width': 3, 'dash': 'dash'},
                   legendgroup=group, showlegend=False),
        row=subplot_row, col=1,
    )
fig2.update_layout(height=1000, width=800, margin=config_margin, legend_tracegroupgap=30, title=config_title)
display(Image(fig2.to_image()))
print("\n")
K-means and hierarchical clustering for PCA + triangles
#@title
# Silhouette-based choice of the number of clusters for pca_waves_df.
# Each KElbowVisualizer is given the estimator, the k range (3, 8) and the
# 'silhouette' metric, is fitted on pca_waves_df and then shown.

# Silhouette score for k-means
model = KMeans(random_state=100)
visualizer = KElbowVisualizer(model, k=(3, 8), metric='silhouette', timings=True, size=(500, 300))
visualizer.fit(pca_waves_df)
visualizer.show()

# Silhouette score for hierarchical clustering.
# Ward linkage only accepts Euclidean distances, which is also the estimator's
# default, so the distance keyword is omitted: its old spelling `affinity` was
# deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`).
model = AgglomerativeClustering(linkage='ward')
visualizer = KElbowVisualizer(model, k=(3, 8), metric='silhouette', timings=True, size=(500, 300))
visualizer.fit(pca_waves_df)
visualizer.show()
#@title
# K-means clustering of pca_waves_df into n+1 groups: print the members of
# every cluster, draw the clusters on a choropleth map, then plot each
# cluster's country series together with the cluster mean.

# n+1 number of clusters
n = 4
print("\033[1m" + f"{n+1} clusters: " + "\033[0m")
kmeans = KMeans(n_clusters=n + 1, random_state=100).fit(pca_waves_df)

# One list of countries per cluster label, printed as it is built.
# NOTE(review): labels are mapped to countries through data.index although the
# model was fitted on pca_waves_df - this assumes both list the countries in
# the same order; confirm.
clusters_list = []
for label in range(n + 1):
    print(f"Cluster {label+1}: ")
    members = [
        name
        for position, name in enumerate(data.index)
        if kmeans.labels_[position] == label
    ]
    print(members, end="\n")
    clusters_list.append(members)
clusters_list_km_pca_triangle = clusters_list.copy()

# Choropleth trace: the colorscale is split into one flat colour band per cluster.
config = {
    'type': 'choropleth',
    'locations': data.index.values,
    'locationmode': 'country names',
    # NOTE(review): one extra value (n+1) is appended, so z is one element
    # longer than locations - confirm this is intentional.
    'z': np.append(kmeans.labels_ + 1, n + 1).astype(int),
    'colorscale': [
        (0.00, "rgb(255, 196, 51)"), (0.2, "rgb(255, 196, 51)"),
        (0.2, "rgb(255, 51, 119)"), (0.4, "rgb(255, 51, 119)"),
        (0.4, "rgb(219, 51, 255)"), (0.6, "rgb(219, 51, 255)"),
        (0.6, "rgb(51, 189, 255)"), (0.8, "rgb(51, 189, 255)"),
        (0.8, "rgb(51, 255, 53)"), (1.0, "rgb(51, 255, 53)"),
    ],
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
    'colorbar': {'nticks': 5, 'tickprefix': 'Cluster '},
}
config_coloraxis = {
    'tickvals': list(range(1, n + 2)),
    'title': 'Clusters',
    'ticks': 'outside',
}
config_margin = {'r': 25, 't': 25, 'l': 25, 'b': 25}

# First part of the figure: the map, rendered to a static image.
fig = go.Figure(data=[config])
fig.update_geos(
    scope='world', projection_type='mercator', resolution=50,
    lataxis_showgrid=True, lataxis_range=[40, 75], lataxis_dtick=10,
    lonaxis_showgrid=True, lonaxis_range=[-30, 70], lonaxis_dtick=10,
)
fig.update_layout(
    height=350, width=500, margin=config_margin,
    coloraxis_colorbar=config_coloraxis,
    title='Division of map according to k-means clustering', title_y=0.97,
)
display(Image(fig.to_image()))

# Figure title settings standing in for a legend title
# (flagged by the author as a broken method).
config_title = {
    'text': 'Countries',
    'x': 0.875,
    'y': 0.99,
    'font_size': 13,
    'font_family': 'Arial',
}

# Second part of the figure: one subplot per cluster.
subplot_titles = [f"Cluster {number + 1}" for number in range(len(clusters_list))]
fig2 = subplots.make_subplots(rows=int(n + 1), cols=1, subplot_titles=subplot_titles, vertical_spacing=0.06)
for group, cluster in enumerate(clusters_list):
    subplot_row = int(group) + 1
    # Row-wise mean over the columns of the cluster's countries.
    cluster_mean = np.mean(europe_df_rolled[cluster], axis=1)
    frame = pd.DataFrame(cluster_mean, columns=['Mean'], index=europe_df_rolled.index)
    for country in cluster:
        # Break the one long country name so it fits in the legend.
        legend_name = 'Bosnia and<br> Herzegovina' if country == 'Bosnia and Herzegovina' else country
        fig2.add_trace(
            go.Scatter(x=europe_df_rolled.index, y=europe_df_rolled[country],
                       name=legend_name, legendgroup=group),
            row=subplot_row, col=1,
        )
    fig2.update_xaxes(title_text='Date', row=subplot_row, col=1)
    fig2.update_yaxes(title_text='New cases per milion', row=subplot_row, col=1, range=[0, 2900])
    # Cluster mean drawn last, as a thick dashed black line without a legend entry.
    fig2.add_trace(
        go.Scatter(x=frame.index, y=frame['Mean'], name='Mean',
                   line={'color': 'black', 'width': 3, 'dash': 'dash'},
                   legendgroup=group, showlegend=False),
        row=subplot_row, col=1,
    )
fig2.update_layout(height=1400, width=800, margin=config_margin, legend_tracegroupgap=55, title=config_title)
display(Image(fig2.to_image()))
print("\n")
#@title
# Hierarchical (Ward) clustering of pca_waves_df into n+1 groups: print the
# members of every cluster, draw the clusters on a choropleth map, then plot
# each cluster's country series together with the cluster mean.

# n+1 number of clusters
n = 4
print("\033[1m" + f"{n+1} clusters: " + "\033[0m")
# The fitted estimator keeps the notebook's shared name `kmeans` although it is
# an AgglomerativeClustering model. Ward linkage only accepts Euclidean
# distances, which is the default, so the distance keyword is omitted: its old
# spelling `affinity` was deprecated in scikit-learn 1.2 and removed in 1.4.
kmeans = AgglomerativeClustering(n_clusters=n + 1, linkage='ward').fit(pca_waves_df)

# One list of countries per cluster label, printed as it is built.
# NOTE(review): labels are mapped to countries through data.index although the
# model was fitted on pca_waves_df - this assumes both list the countries in
# the same order; confirm.
clusters_list = []
for label in range(n + 1):
    print(f"Cluster {label+1}: ")
    members = [
        name
        for position, name in enumerate(data.index)
        if kmeans.labels_[position] == label
    ]
    print(members, end="\n")
    clusters_list.append(members)

# Choropleth trace: the colorscale is split into one flat colour band per cluster.
config = {
    'type': 'choropleth',
    'locations': data.index.values,
    'locationmode': 'country names',
    # NOTE(review): one extra value (n+1) is appended, so z is one element
    # longer than locations - confirm this is intentional.
    'z': np.append(kmeans.labels_ + 1, n + 1).astype(int),
    'colorscale': [
        (0.00, "rgb(255, 196, 51)"), (0.2, "rgb(255, 196, 51)"),
        (0.2, "rgb(255, 51, 119)"), (0.4, "rgb(255, 51, 119)"),
        (0.4, "rgb(219, 51, 255)"), (0.6, "rgb(219, 51, 255)"),
        (0.6, "rgb(51, 189, 255)"), (0.8, "rgb(51, 189, 255)"),
        (0.8, "rgb(51, 255, 53)"), (1.0, "rgb(51, 255, 53)"),
    ],
    'marker_line_color': 'black',
    'marker_line_width': 0.5,
    'colorbar': {'nticks': 5, 'tickprefix': 'Cluster '},
}
config_coloraxis = {
    'tickvals': list(range(1, n + 2)),
    'title': 'Clusters',
    'ticks': 'outside',
}
config_margin = {'r': 25, 't': 25, 'l': 25, 'b': 25}

# First part of the figure: the map, rendered to a static image.
fig = go.Figure(data=[config])
fig.update_geos(
    scope='world', projection_type='mercator', resolution=50,
    lataxis_showgrid=True, lataxis_range=[40, 75], lataxis_dtick=10,
    lonaxis_showgrid=True, lonaxis_range=[-30, 70], lonaxis_dtick=10,
)
fig.update_layout(
    height=350, width=500, margin=config_margin,
    coloraxis_colorbar=config_coloraxis,
    title='Division of map according to hierarchical clustering', title_y=0.97,
)
display(Image(fig.to_image()))

# Figure title settings standing in for a legend title
# (flagged by the author as a broken method).
config_title = {
    'text': 'Countries',
    'x': 0.875,
    'y': 0.99,
    'font_size': 13,
    'font_family': 'Arial',
}

# Second part of the figure: one subplot per cluster.
subplot_titles = [f"Cluster {number + 1}" for number in range(len(clusters_list))]
fig2 = subplots.make_subplots(rows=int(n + 1), cols=1, subplot_titles=subplot_titles, vertical_spacing=0.06)
for group, cluster in enumerate(clusters_list):
    subplot_row = int(group) + 1
    # Row-wise mean over the columns of the cluster's countries.
    cluster_mean = np.mean(europe_df_rolled[cluster], axis=1)
    frame = pd.DataFrame(cluster_mean, columns=['Mean'], index=europe_df_rolled.index)
    for country in cluster:
        # Break the one long country name so it fits in the legend.
        legend_name = 'Bosnia and<br> Herzegovina' if country == 'Bosnia and Herzegovina' else country
        fig2.add_trace(
            go.Scatter(x=europe_df_rolled.index, y=europe_df_rolled[country],
                       name=legend_name, legendgroup=group),
            row=subplot_row, col=1,
        )
    fig2.update_xaxes(title_text='Date', row=subplot_row, col=1)
    fig2.update_yaxes(title_text='New cases per milion', row=subplot_row, col=1, range=[0, 2900])
    # Cluster mean drawn last, as a thick dashed black line without a legend entry.
    fig2.add_trace(
        go.Scatter(x=frame.index, y=frame['Mean'], name='Mean',
                   line={'color': 'black', 'width': 3, 'dash': 'dash'},
                   legendgroup=group, showlegend=False),
        row=subplot_row, col=1,
    )
fig2.update_layout(height=1400, width=800, margin=config_margin, legend_tracegroupgap=55, title=config_title)
display(Image(fig2.to_image()))
print("\n")